import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'

Dataset creation¶
hiv_df = pd.read_csv(r'HIV data 2000-2023.csv', encoding='ISO-8859-1')
hiv_df.head()
| IndicatorCode | Indicator | ValueType | ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period type | Period | Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2023 | 320 000 [280 000 - 380 000] |
| 1 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2022 | 320 000 [280 000 - 380 000] |
| 2 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2021 | 320 000 [280 000 - 380 000] |
| 3 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2020 | 320 000 [280 000 - 370 000] |
| 4 | HIV_0000000001 | Estimated number of people (all ages) living w... | numeric | AFR | Africa | Country | AGO | Angola | Year | 2015 | 300 000 [260 000 - 350 000] |
Attributes¶
IndicatorCode: A unique identifier for the indicator being measured (e.g.,
"HIV_0000000001"for the estimated number of people living with HIV).Indicator: A description of the indicator being measured (e.g.,
"Estimated number of people (all ages) living with HIV").ValueType: Specifies the type of data recorded (e.g.,
"numeric"for numerical values).ParentLocationCode: A code representing the broader geographical region to which the location belongs (e.g.,
"AFR"for Africa).ParentLocation: The name of the broader geographical region (e.g.,
"Africa").Location type: Describes the type of location (e.g.,
"Country").SpatialDimValueCode: A unique code for the specific location (e.g.,
"AGO"for Angola).Location: The name of the specific location (e.g.,
"Angola").Period type: Specifies the type of time period (e.g.,
"Year").Period: The year for which the data is recorded (e.g.,
"2023").Value: The estimated number of people living with HIV, often including a range (e.g.,
"320 000 [280 000 - 380 000]"for Angola in 2023).
This dataset provides detailed information on HIV prevalence across various countries and regions, with data spanning multiple years (2000–2023). The Value field is particularly important as it contains the estimated figures, often accompanied by confidence intervals. The dataset is structured to allow analysis by region, country, and year.
Since the dataset focuses on an estimated number of people living with HIV, the columns IndicatorCode, Indicator, ValueType and Period type are redundant and do not add analytical value to trend analysis hence we drop them
# deleting redundant columns that do not add analytic value
del hiv_df['IndicatorCode']
del hiv_df['Indicator']
del hiv_df['ValueType']
del hiv_df['Period type']
hiv_df
| ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|---|
| 0 | AFR | Africa | Country | AGO | Angola | 2023 | 320 000 [280 000 - 380 000] |
| 1 | AFR | Africa | Country | AGO | Angola | 2022 | 320 000 [280 000 - 380 000] |
| 2 | AFR | Africa | Country | AGO | Angola | 2021 | 320 000 [280 000 - 380 000] |
| 3 | AFR | Africa | Country | AGO | Angola | 2020 | 320 000 [280 000 - 370 000] |
| 4 | AFR | Africa | Country | AGO | Angola | 2015 | 300 000 [260 000 - 350 000] |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1547 | WPR | Western Pacific | Country | WSM | Samoa | 2020 | No data |
| 1548 | WPR | Western Pacific | Country | WSM | Samoa | 2015 | No data |
| 1549 | WPR | Western Pacific | Country | WSM | Samoa | 2010 | No data |
| 1550 | WPR | Western Pacific | Country | WSM | Samoa | 2005 | No data |
| 1551 | WPR | Western Pacific | Country | WSM | Samoa | 2000 | No data |
1552 rows × 7 columns
hiv_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1552 entries, 0 to 1551 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ParentLocationCode 1552 non-null object 1 ParentLocation 1552 non-null object 2 Location type 1552 non-null object 3 SpatialDimValueCode 1552 non-null object 4 Location 1552 non-null object 5 Period 1552 non-null int64 6 Value 1552 non-null object dtypes: int64(1), object(6) memory usage: 85.0+ KB
Cleaning¶
# duplicates
hiv_df.duplicated().sum()
0
# nulls
hiv_df.isna().sum()
ParentLocationCode 0 ParentLocation 0 Location type 0 SpatialDimValueCode 0 Location 0 Period 0 Value 0 dtype: int64
There are no duplicates nor null values. However, some entries of the Value field have "No data" or placeholder values like "<200" for very small estimates so we need to clean it to have single values
# entries with the value "No data"
no_data = hiv_df[hiv_df['Value'] == 'No data']
no_data
| ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|---|
| 40 | AFR | Africa | Country | CAF | Central African Republic | 2023 | No data |
| 41 | AFR | Africa | Country | CAF | Central African Republic | 2022 | No data |
| 42 | AFR | Africa | Country | CAF | Central African Republic | 2021 | No data |
| 43 | AFR | Africa | Country | CAF | Central African Republic | 2020 | No data |
| 44 | AFR | Africa | Country | CAF | Central African Republic | 2015 | No data |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1547 | WPR | Western Pacific | Country | WSM | Samoa | 2020 | No data |
| 1548 | WPR | Western Pacific | Country | WSM | Samoa | 2015 | No data |
| 1549 | WPR | Western Pacific | Country | WSM | Samoa | 2010 | No data |
| 1550 | WPR | Western Pacific | Country | WSM | Samoa | 2005 | No data |
| 1551 | WPR | Western Pacific | Country | WSM | Samoa | 2000 | No data |
394 rows × 7 columns
# countries with No data on people living with HIV
no_data['Location'].unique()
array(['Central African Republic', 'Cameroon', 'Equatorial Guinea',
'Sao Tome and Principe', 'Seychelles', 'Antigua and Barbuda',
'Canada', 'Dominica', 'Grenada', 'Saint Kitts and Nevis',
'Saint Lucia', 'Trinidad and Tobago', 'United States of America',
'Saint Vincent and the Grenadines', 'Bahrain', 'Andorra',
'Austria', 'Belgium', 'Cyprus', 'Germany', 'Finland',
'United Kingdom of Great Britain and Northern Ireland', 'Hungary',
'Monaco', 'Netherlands (Kingdom of the)', 'Norway', 'Poland',
'Russian Federation', 'San Marino', 'Sweden', 'Turkmenistan',
'T\x9frkiye', 'Ukraine', 'Uzbekistan', 'India', 'Maldives',
"Democratic People's Republic of Korea", 'Brunei Darussalam',
'China', 'Cook Islands', 'Micronesia (Federated States of)',
'Japan', 'Kiribati', 'Republic of Korea', 'Marshall Islands',
'Niue', 'Nauru', 'Palau', 'Solomon Islands', 'Tonga', 'Tuvalu',
'Vanuatu', 'Samoa'], dtype=object)
# cleaning our dataset entries with the value "No data"
hiv_df = hiv_df[hiv_df['Value'] != 'No data']
hiv_df
| ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|---|
| 0 | AFR | Africa | Country | AGO | Angola | 2023 | 320 000 [280 000 - 380 000] |
| 1 | AFR | Africa | Country | AGO | Angola | 2022 | 320 000 [280 000 - 380 000] |
| 2 | AFR | Africa | Country | AGO | Angola | 2021 | 320 000 [280 000 - 380 000] |
| 3 | AFR | Africa | Country | AGO | Angola | 2020 | 320 000 [280 000 - 370 000] |
| 4 | AFR | Africa | Country | AGO | Angola | 2015 | 300 000 [260 000 - 350 000] |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | WPR | Western Pacific | Country | VNM | Viet Nam | 2020 | 250 000 [230 000 - 270 000] |
| 1532 | WPR | Western Pacific | Country | VNM | Viet Nam | 2015 | 240 000 [210 000 - 260 000] |
| 1533 | WPR | Western Pacific | Country | VNM | Viet Nam | 2010 | 210 000 [190 000 - 230 000] |
| 1534 | WPR | Western Pacific | Country | VNM | Viet Nam | 2005 | 180 000 [150 000 - 200 000] |
| 1535 | WPR | Western Pacific | Country | VNM | Viet Nam | 2000 | 120 000 [100 000 - 140 000] |
1158 rows × 7 columns
We have dropped 394 columns with 'No Data' as their value for people living with HIV
# extract the central estimate (320000) as a number for values with the format: 320 000 [280 000 - 380 000]
def extract_value(val):
if isinstance(val, str):
# Handle values like "<500" at the beginning
if val.startswith('<'):
number = int(val[1:].split()[0]) # take just the number part before any space
return number - 1 # assume just under that number
else:
# Extract the number at the start if it's not a "<" value
match = pd.Series(val).str.extract(r'^([\d\s]+)').iloc[0, 0]
if match:
return float(match.replace(' ', '')) # remove spaces, convert to float
return None # fallback if no match
# Apply to value column and convert to int
hiv_df.loc[:, 'Value'] = hiv_df['Value'].apply(extract_value)
hiv_df['Value'] = hiv_df['Value'].astype(int)
hiv_df
C:\Users\deninjo\AppData\Local\Temp\ipykernel_6524\1950752774.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| ParentLocationCode | ParentLocation | Location type | SpatialDimValueCode | Location | Period | Value | |
|---|---|---|---|---|---|---|---|
| 0 | AFR | Africa | Country | AGO | Angola | 2023 | 320000 |
| 1 | AFR | Africa | Country | AGO | Angola | 2022 | 320000 |
| 2 | AFR | Africa | Country | AGO | Angola | 2021 | 320000 |
| 3 | AFR | Africa | Country | AGO | Angola | 2020 | 320000 |
| 4 | AFR | Africa | Country | AGO | Angola | 2015 | 300000 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | WPR | Western Pacific | Country | VNM | Viet Nam | 2020 | 250000 |
| 1532 | WPR | Western Pacific | Country | VNM | Viet Nam | 2015 | 240000 |
| 1533 | WPR | Western Pacific | Country | VNM | Viet Nam | 2010 | 210000 |
| 1534 | WPR | Western Pacific | Country | VNM | Viet Nam | 2005 | 180000 |
| 1535 | WPR | Western Pacific | Country | VNM | Viet Nam | 2000 | 120000 |
1158 rows × 7 columns
hiv_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 1158 entries, 0 to 1535 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ParentLocationCode 1158 non-null object 1 ParentLocation 1158 non-null object 2 Location type 1158 non-null object 3 SpatialDimValueCode 1158 non-null object 4 Location 1158 non-null object 5 Period 1158 non-null int64 6 Value 1158 non-null int32 dtypes: int32(1), int64(1), object(5) memory usage: 67.9+ KB

Question One¶
Create a visualization that shows the trend of HIV cases in the countries that contribute to 75% of the global burden¶
We'll first group the cleaned dataset by Location and Period, summing up Value.
# 1. Group by year and country to get total HIV cases per country per year
country_year_df = hiv_df.groupby(['Location', 'Period'])['Value'].sum().reset_index()
country_year_df
| Location | Period | Value | |
|---|---|---|---|
| 0 | Afghanistan | 2000 | 1600 |
| 1 | Afghanistan | 2005 | 2800 |
| 2 | Afghanistan | 2010 | 4100 |
| 3 | Afghanistan | 2015 | 6500 |
| 4 | Afghanistan | 2020 | 10000 |
| ... | ... | ... | ... |
| 1153 | Zimbabwe | 2015 | 1400000 |
| 1154 | Zimbabwe | 2020 | 1300000 |
| 1155 | Zimbabwe | 2021 | 1300000 |
| 1156 | Zimbabwe | 2022 | 1300000 |
| 1157 | Zimbabwe | 2023 | 1300000 |
1158 rows × 3 columns
We then calculate Total Global HIV Cases Per Year. This helps us determine each country’s contribution to the global burden for each year.
# 2. Total global cases per year
global_yearly_total = country_year_df.groupby('Period')['Value'].sum().reset_index()
global_yearly_total.rename(columns={'Value': 'GlobalTotal'}, inplace=True)
global_yearly_total
| Period | GlobalTotal | |
|---|---|---|
| 0 | 2000 | 21353916 |
| 1 | 2005 | 23492604 |
| 2 | 2010 | 26058607 |
| 3 | 2015 | 28925842 |
| 4 | 2020 | 31016146 |
| 5 | 2021 | 31314227 |
| 6 | 2022 | 31198307 |
| 7 | 2023 | 33933987 |
Merge global total and find country percentage
For each year, we already know the total number of HIV cases globally (
GlobalTotal).Now, we add that total next to each country’s number for that year.
Then, we calculate what percentage each country contributes that year:
Example: If Kenya had 100,000 cases and the world had 1,000,000 cases that year → Kenya’s share = 10%.
# 3. Merge global total back to country-level data
merged_df = country_year_df.merge(global_yearly_total, on='Period')
merged_df['Percent'] = merged_df['Value'] / merged_df['GlobalTotal'] * 100
merged_df
| Location | Period | Value | GlobalTotal | Percent | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2000 | 1600 | 21353916 | 0.007493 |
| 1 | Afghanistan | 2005 | 2800 | 23492604 | 0.011919 |
| 2 | Afghanistan | 2010 | 4100 | 26058607 | 0.015734 |
| 3 | Afghanistan | 2015 | 6500 | 28925842 | 0.022471 |
| 4 | Afghanistan | 2020 | 10000 | 31016146 | 0.032241 |
| ... | ... | ... | ... | ... | ... |
| 1153 | Zimbabwe | 2015 | 1400000 | 28925842 | 4.839963 |
| 1154 | Zimbabwe | 2020 | 1300000 | 31016146 | 4.191365 |
| 1155 | Zimbabwe | 2021 | 1300000 | 31314227 | 4.151468 |
| 1156 | Zimbabwe | 2022 | 1300000 | 31198307 | 4.166893 |
| 1157 | Zimbabwe | 2023 | 1300000 | 33933987 | 3.830967 |
1158 rows × 5 columns
Instead of picking the biggest contributors only based on today (latest year) or on average across years, we are now looking at the total number of cases a country has contributed from 2000 to 2023.
This way, it tells us which countries have been the biggest contributors overall — not just recently, and favors countries that had high numbers consistently across many years, not just a sudden rise or fall.
# 4. Sum total cases per country across all years
country_total = merged_df.groupby('Location')['Value'].sum().reset_index()
global_total_sum = global_yearly_total['GlobalTotal'].sum()
country_total
| Location | Value | |
|---|---|---|
| 0 | Afghanistan | 61000 |
| 1 | Albania | 8208 |
| 2 | Algeria | 126100 |
| 3 | Angola | 2160000 |
| 4 | Argentina | 859000 |
| ... | ... | ... |
| 142 | Venezuela (Bolivarian Republic of) | 724000 |
| 143 | Viet Nam | 1750000 |
| 144 | Yemen | 78100 |
| 145 | Zambia | 8890000 |
| 146 | Zimbabwe | 10900000 |
147 rows × 2 columns
We then find the percentage each country contributed to this global total
# 5. Calculate percent contribution of each country
country_total['Percent'] = country_total['Value'] / global_total_sum * 100
country_total
| Location | Value | Percent | |
|---|---|---|---|
| 0 | Afghanistan | 61000 | 0.026838 |
| 1 | Albania | 8208 | 0.003611 |
| 2 | Algeria | 126100 | 0.055479 |
| 3 | Angola | 2160000 | 0.950313 |
| 4 | Argentina | 859000 | 0.377925 |
| ... | ... | ... | ... |
| 142 | Venezuela (Bolivarian Republic of) | 724000 | 0.318531 |
| 143 | Viet Nam | 1750000 | 0.769929 |
| 144 | Yemen | 78100 | 0.034361 |
| 145 | Zambia | 8890000 | 3.911240 |
| 146 | Zimbabwe | 10900000 | 4.795559 |
147 rows × 3 columns
Filter Top Contributors Covering 75% of Global Burden by sorting countries by their contribution and cumulatively sum their % share until reaching 75%.
# 6. Sort and compute cumulative percent
country_total = country_total.sort_values('Percent', ascending=False)
country_total['CumulativePercent'] = country_total['Percent'].cumsum()
# 7. Select countries contributing to 75% of total global cases
top_countries = country_total[country_total['CumulativePercent'] <= 75]['Location'].tolist()
top_countries
['South Africa', 'Mozambique', 'Nigeria', 'Kenya', 'United Republic of Tanzania', 'Zimbabwe', 'Uganda', 'Zambia', 'Malawi', 'Brazil', 'Thailand', 'Ethiopia', 'Democratic Republic of the Congo', "Cote d'Ivoire"]
# 8. Filter original country-year data for those top countries
top_country_trend = country_year_df[country_year_df['Location'].isin(top_countries)]
# 9. Plot the trend
fig = px.line(top_country_trend,
x='Period',
y='Value',
color='Location',
title='Trend of HIV Cases in Top Contributing Countries (75% of Cumulative Global Cases)',
labels={'Value': 'Estimated HIV Cases', 'Period': 'Year'})
fig.show()
Generate a visualization that displays the trend of HIV cases in the countries contributing to 75% of the burden within each WHO region (column called ParentLocationCode contains the WHO regions)¶
Instead of asking "Who are the biggest players globally?" we are now trying to find out, "Who are the biggest players inside each WHO region?".
First and foremost, for each WHO region, country, and year, we sum the HIV cases.
# 1. Group by year, country, and WHO region to get total HIV cases per country per year
country_year_df = hiv_df.groupby(['ParentLocationCode', 'Location', 'Period'])['Value'].sum().reset_index()
country_year_df
| ParentLocationCode | Location | Period | Value | |
|---|---|---|---|---|
| 0 | AFR | Algeria | 2000 | 2400 |
| 1 | AFR | Algeria | 2005 | 5700 |
| 2 | AFR | Algeria | 2010 | 10000 |
| 3 | AFR | Algeria | 2015 | 15000 |
| 4 | AFR | Algeria | 2020 | 21000 |
| ... | ... | ... | ... | ... |
| 1153 | WPR | Viet Nam | 2015 | 240000 |
| 1154 | WPR | Viet Nam | 2020 | 250000 |
| 1155 | WPR | Viet Nam | 2021 | 250000 |
| 1156 | WPR | Viet Nam | 2022 | 250000 |
| 1157 | WPR | Viet Nam | 2023 | 250000 |
1158 rows × 4 columns
Now, for each WHO region and year, we find the total HIV cases (summing all countries in the region) so that we can find out each country's share within its region later on
# 2. Total regional cases per year
regional_yearly_total = country_year_df.groupby(['ParentLocationCode', 'Period'])['Value'].sum().reset_index()
regional_yearly_total.rename(columns={'Value': 'RegionalTotal'}, inplace=True)
regional_yearly_total
| ParentLocationCode | Period | RegionalTotal | |
|---|---|---|---|
| 0 | AFR | 2000 | 18088999 |
| 1 | AFR | 2005 | 19435099 |
| 2 | AFR | 2010 | 21219799 |
| 3 | AFR | 2015 | 23436199 |
| 4 | AFR | 2020 | 24838099 |
| 5 | AFR | 2021 | 25025199 |
| 6 | AFR | 2022 | 25102299 |
| 7 | AFR | 2023 | 25199599 |
| 8 | AMR | 2000 | 1152900 |
| 9 | AMR | 2005 | 1459800 |
| 10 | AMR | 2010 | 1773900 |
| 11 | AMR | 2015 | 2091600 |
| 12 | AMR | 2020 | 2455300 |
| 13 | AMR | 2021 | 2480700 |
| 14 | AMR | 2022 | 2558000 |
| 15 | AMR | 2023 | 2631200 |
| 16 | EMR | 2000 | 98893 |
| 17 | EMR | 2005 | 158135 |
| 18 | EMR | 2010 | 243666 |
| 19 | EMR | 2015 | 313167 |
| 20 | EMR | 2020 | 422730 |
| 21 | EMR | 2021 | 452150 |
| 22 | EMR | 2022 | 486170 |
| 23 | EMR | 2023 | 529280 |
| 24 | EUR | 2000 | 577537 |
| 25 | EUR | 2005 | 746342 |
| 26 | EUR | 2010 | 937863 |
| 27 | EUR | 2015 | 1057376 |
| 28 | EUR | 2020 | 1165097 |
| 29 | EUR | 2021 | 1184638 |
| 30 | EUR | 2022 | 864688 |
| 31 | EUR | 2023 | 849238 |
| 32 | SEAR | 2000 | 1147589 |
| 33 | SEAR | 2005 | 1312330 |
| 34 | SEAR | 2010 | 1427320 |
| 35 | SEAR | 2015 | 1478780 |
| 36 | SEAR | 2020 | 1481300 |
| 37 | SEAR | 2021 | 1491400 |
| 38 | SEAR | 2022 | 1482600 |
| 39 | SEAR | 2023 | 3982700 |
| 40 | WPR | 2000 | 287998 |
| 41 | WPR | 2005 | 380898 |
| 42 | WPR | 2010 | 456059 |
| 43 | WPR | 2015 | 548720 |
| 44 | WPR | 2020 | 653620 |
| 45 | WPR | 2021 | 680140 |
| 46 | WPR | 2022 | 704550 |
| 47 | WPR | 2023 | 741970 |
We then find the percentage each country contributed, each year inside its region.
# 3. Merge regional total back to country-level data
merged_df = country_year_df.merge(regional_yearly_total, on=['ParentLocationCode', 'Period'])
merged_df['Percent'] = merged_df['Value'] / merged_df['RegionalTotal'] * 100
merged_df
| ParentLocationCode | Location | Period | Value | RegionalTotal | Percent | |
|---|---|---|---|---|---|---|
| 0 | AFR | Algeria | 2000 | 2400 | 18088999 | 0.013268 |
| 1 | AFR | Algeria | 2005 | 5700 | 19435099 | 0.029328 |
| 2 | AFR | Algeria | 2010 | 10000 | 21219799 | 0.047126 |
| 3 | AFR | Algeria | 2015 | 15000 | 23436199 | 0.064004 |
| 4 | AFR | Algeria | 2020 | 21000 | 24838099 | 0.084548 |
| ... | ... | ... | ... | ... | ... | ... |
| 1153 | WPR | Viet Nam | 2015 | 240000 | 548720 | 43.738154 |
| 1154 | WPR | Viet Nam | 2020 | 250000 | 653620 | 38.248524 |
| 1155 | WPR | Viet Nam | 2021 | 250000 | 680140 | 36.757138 |
| 1156 | WPR | Viet Nam | 2022 | 250000 | 704550 | 35.483642 |
| 1157 | WPR | Viet Nam | 2023 | 250000 | 741970 | 33.694085 |
1158 rows × 6 columns
For each country and region, sum all HIV cases across all years (2000–2023). This tells US how big a contributor the country is overall, not just in a year.
# 4. Sum Cases Across All Years
total_contribution = merged_df.groupby(['ParentLocationCode', 'Location'])['Value'].sum().reset_index()
total_contribution
| ParentLocationCode | Location | Value | |
|---|---|---|---|
| 0 | AFR | Algeria | 126100 |
| 1 | AFR | Angola | 2160000 |
| 2 | AFR | Benin | 542000 |
| 3 | AFR | Botswana | 2680000 |
| 4 | AFR | Burkina Faso | 901000 |
| ... | ... | ... | ... |
| 142 | WPR | New Zealand | 22600 |
| 143 | WPR | Papua New Guinea | 392000 |
| 144 | WPR | Philippines | 686400 |
| 145 | WPR | Singapore | 45000 |
| 146 | WPR | Viet Nam | 1750000 |
147 rows × 3 columns
We then calculate the Total Sum per Region
# 5. Get total sum per region for percentage calculation
regional_total_sum = total_contribution.groupby('ParentLocationCode')['Value'].sum().reset_index()
regional_total_sum.rename(columns={'Value': 'RegionalTotalSum'}, inplace=True)
regional_total_sum
| ParentLocationCode | RegionalTotalSum | |
|---|---|---|
| 0 | AFR | 182345292 |
| 1 | AMR | 16603400 |
| 2 | EMR | 2704191 |
| 3 | EUR | 7382779 |
| 4 | SEAR | 13804019 |
| 5 | WPR | 4453955 |
We merge the total region sums into country-level sums then calculate what % each country contributed overall to its WHO region (across all years).
# 6. Merge to calculate each country's % contribution within its region
total_contribution = total_contribution.merge(regional_total_sum, on='ParentLocationCode')
total_contribution['Percent'] = total_contribution['Value'] / total_contribution['RegionalTotalSum'] * 100
total_contribution
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | |
|---|---|---|---|---|---|
| 0 | AFR | Algeria | 126100 | 182345292 | 0.069155 |
| 1 | AFR | Angola | 2160000 | 182345292 | 1.184566 |
| 2 | AFR | Benin | 542000 | 182345292 | 0.297238 |
| 3 | AFR | Botswana | 2680000 | 182345292 | 1.469739 |
| 4 | AFR | Burkina Faso | 901000 | 182345292 | 0.494118 |
| ... | ... | ... | ... | ... | ... |
| 142 | WPR | New Zealand | 22600 | 4453955 | 0.507414 |
| 143 | WPR | Papua New Guinea | 392000 | 4453955 | 8.801167 |
| 144 | WPR | Philippines | 686400 | 4453955 | 15.411022 |
| 145 | WPR | Singapore | 45000 | 4453955 | 1.010338 |
| 146 | WPR | Viet Nam | 1750000 | 4453955 | 39.290922 |
147 rows × 5 columns
Sort countries inside each WHO region from highest contributor to lowest, then cumulatively add up percentages within each region.
# 7. Sort and get cumulative percentage
total_contribution = total_contribution.sort_values(['ParentLocationCode', 'Percent'], ascending=[True, False])
total_contribution['CumulativePercent'] = total_contribution.groupby('ParentLocationCode')['Percent'].cumsum()
total_contribution
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | CumulativePercent | |
|---|---|---|---|---|---|---|
| 35 | AFR | South Africa | 51400000 | 182345292 | 28.188279 | 28.188279 |
| 28 | AFR | Mozambique | 15150000 | 182345292 | 8.308413 | 36.496692 |
| 31 | AFR | Nigeria | 15000000 | 182345292 | 8.226152 | 44.722844 |
| 20 | AFR | Kenya | 12000000 | 182345292 | 6.580921 | 51.303765 |
| 39 | AFR | United Republic of Tanzania | 11600000 | 182345292 | 6.361557 | 57.665322 |
| ... | ... | ... | ... | ... | ... | ... |
| 139 | WPR | Lao People's Democratic Republic | 104400 | 4453955 | 2.343984 | 98.133008 |
| 145 | WPR | Singapore | 45000 | 4453955 | 1.010338 | 99.143346 |
| 142 | WPR | New Zealand | 22600 | 4453955 | 0.507414 | 99.650760 |
| 138 | WPR | Fiji | 11358 | 4453955 | 0.255009 | 99.905769 |
| 141 | WPR | Mongolia | 4197 | 4453955 | 0.094231 | 100.000000 |
147 rows × 6 columns
Keep only the top countries whose cumulative contribution is <= 75% in each WHO region.
# 8. Select countries contributing to 75% within each region
top_countries_per_region = (
total_contribution[total_contribution['CumulativePercent'] <= 75]
)
top_countries_per_region
| ParentLocationCode | Location | Value | RegionalTotalSum | Percent | CumulativePercent | |
|---|---|---|---|---|---|---|
| 35 | AFR | South Africa | 51400000 | 182345292 | 28.188279 | 28.188279 |
| 28 | AFR | Mozambique | 15150000 | 182345292 | 8.308413 | 36.496692 |
| 31 | AFR | Nigeria | 15000000 | 182345292 | 8.226152 | 44.722844 |
| 20 | AFR | Kenya | 12000000 | 182345292 | 6.580921 | 51.303765 |
| 39 | AFR | United Republic of Tanzania | 11600000 | 182345292 | 6.361557 | 57.665322 |
| 41 | AFR | Zimbabwe | 10900000 | 182345292 | 5.977670 | 63.642992 |
| 38 | AFR | Uganda | 10500000 | 182345292 | 5.758306 | 69.401298 |
| 40 | AFR | Zambia | 8890000 | 182345292 | 4.875366 | 74.276664 |
| 47 | AMR | Brazil | 6070000 | 16603400 | 36.558777 | 36.558777 |
| 61 | AMR | Mexico | 2190000 | 16603400 | 13.190070 | 49.748847 |
| 50 | AMR | Colombia | 1430000 | 16603400 | 8.612694 | 58.361540 |
| 58 | AMR | Haiti | 1010000 | 16603400 | 6.083091 | 64.444632 |
| 42 | AMR | Argentina | 859000 | 16603400 | 5.173639 | 69.618271 |
| 68 | AMR | Venezuela (Bolivarian Republic of) | 724000 | 16603400 | 4.360553 | 73.978824 |
| 80 | EMR | Pakistan | 1247600 | 2704191 | 46.135794 | 46.135794 |
| 72 | EMR | Iran (Islamic Republic of) | 327000 | 2704191 | 12.092341 | 58.228136 |
| 84 | EMR | Sudan | 273000 | 2704191 | 10.095441 | 68.323576 |
| 71 | EMR | Egypt | 154400 | 2704191 | 5.709656 | 74.033232 |
| 125 | EUR | Ukraine | 1430000 | 7382779 | 19.369400 | 19.369400 |
| 99 | EUR | France | 1285000 | 7382779 | 17.405370 | 36.774770 |
| 122 | EUR | Spain | 1090000 | 7382779 | 14.764088 | 51.538858 |
| 106 | EUR | Italy | 991000 | 7382779 | 13.423130 | 64.961988 |
| 116 | EUR | Portugal | 344000 | 7382779 | 4.659492 | 69.621480 |
| 101 | EUR | Germany | 323000 | 7382779 | 4.375046 | 73.996526 |
| 134 | SEAR | Thailand | 5430000 | 13804019 | 39.336370 | 39.336370 |
| 130 | SEAR | Indonesia | 3513000 | 13804019 | 25.449110 | 64.785480 |
| 146 | WPR | Viet Nam | 1750000 | 4453955 | 39.290922 | 39.290922 |
| 144 | WPR | Philippines | 686400 | 4453955 | 15.411022 | 54.701945 |
| 137 | WPR | Cambodia | 639000 | 4453955 | 14.346800 | 69.048744 |
# 9. Merge ParentLocation (full name) into top_countries_per_region
# get ParentLocation mappings from hiv_df
region_names = hiv_df[['ParentLocationCode', 'ParentLocation']].drop_duplicates()
# Merge region names into your top_countries_per_region
top_countries_per_region = top_countries_per_region.merge(region_names, on='ParentLocationCode', how='left')
# 10. Now filter your original merged_df for only the top countries
top_country_trend = merged_df[
merged_df['Location'].isin(top_countries_per_region['Location'])
]
# Also merge to get ParentLocation for plotting
top_country_trend = top_country_trend.merge(region_names, on='ParentLocationCode', how='left')
# 9. Plot the trend
fig = px.line(
top_country_trend,
x='Period',
y='Value',
color='Location',
facet_col='ParentLocation', # <--- This will separate by WHO region nicely
facet_col_wrap=2, # Wrap facets into multiple rows if too many regions
title='Trend of HIV Cases in Top Contributing Countries (75% Global Burden per Region)',
labels={
'Value': 'Estimated HIV Cases',
'Period': 'Year',
'Location': 'Country'
}
)
fig.update_layout(height=800)
fig.show()